The code for this report is on github here.

> library(useful)
> library(readr)
> library(dplyr)
> library(tidyr)
> library(ggplot2)
> library(stringr)

Read in the data for each plate into one big dataframe, make unique IDs for each batch/well combination and use those IDs as the column names.

> plate_reader = function(fn) {
+     batch = strsplit(basename(fn), ".", fixed = TRUE)[[1]][1]
+     data = read.table(fn, header = TRUE, row.names = 1)
+     colnames(data) = paste(batch, colnames(data), sep = "_")
+     data
+ }
> plates_fns = sort(list.files("data", pattern = "*.dat", full.names = TRUE))
> plates = do.call(cbind, lapply(plates_fns, plate_reader))

Create a dataframe of the metadata about each sample. This has an identifier for a sample, Which well it came from, which batch and what it was treated with.

> welldata_fn = "metadata/Compound Layout 384w.csv"
> welldata = read_csv(welldata_fn) %>% gather(column, treatment, -row) %>% mutate(well = paste(row, 
+     column, sep = "")) %>% dplyr::select(well, treatment)
> identities = data.frame(str_split_fixed(colnames(plates), "_", 3))
> colnames(identities) = c("batch", "drop", "well")
> identities$id = colnames(plates)
> welldata = identities %>% dplyr::select(batch, well) %>% left_join(welldata, 
+     by = "well")
> rownames(welldata) = colnames(plates)
> welldata$sample = colnames(plates)
> welldata$classes = ifelse(welldata$treatment == "DMSO", "DMSO", ifelse(welldata$treatment == 
+     "AdMyoD", "AdMyoD", "other"))

Verify that the samples match up between the read counts and the metadata dataframe and make sure there are no NA counts.

> dim(welldata)
[1] 1920    5
> dim(plates)
[1] 23481  1920
> table(rownames(welldata) %in% colnames(plates))

TRUE 
1920 
> corner(plates)
              M1_T384s1_A1 M1_T384s1_A2 M1_T384s1_A3 M1_T384s1_A4
0610005C13Rik            0            0            0            0
0610007N19Rik            2            5            2           12
0610007P14Rik            1            4            0           14
0610008F07Rik            0            0            0            0
0610009B14Rik            0            0            0            0
              M1_T384s1_A5
0610005C13Rik            0
0610007N19Rik            9
0610007P14Rik            8
0610008F07Rik            0
0610009B14Rik            0
> corner(welldata)
             batch well              treatment       sample classes
M1_T384s1_A1    M1   A1                   DMSO M1_T384s1_A1    DMSO
M1_T384s1_A2    M1   A2   ABT-263 (Navitoclax) M1_T384s1_A2   other
M1_T384s1_A3    M1   A3              TG100-115 M1_T384s1_A3   other
M1_T384s1_A4    M1   A4 Lenalidomide (CC-5013) M1_T384s1_A4   other
M1_T384s1_A5    M1   A5          Oxcarbazepine M1_T384s1_A5   other
> table(complete.cases(plates))

 TRUE 
23481 

Looks like we are good to go. While we’re at it we will load the positive control data.

> positive_fn = "data/Feo_positive_controls.unq.refseq.umi.SC_and_Myoblast_Raw.txt"
> positive = read.table(positive_fn, header = TRUE, row.names = rownames(plates))
> positive$id = NULL
> positive_samples = data.frame(str_split_fixed(colnames(positive), "_", 3))
> identities = data.frame(str_split_fixed(colnames(plates), "_", 3))$X1
> positive_welldata = data.frame(batch = positive_samples$X1, treatment = positive_samples$X1, 
+     well = positive_samples$X1)
> rownames(positive_welldata) = colnames(positive)

Now we will calculate some summary statistics about each sample.

> welldata$genes_detected = colSums(plates > 0)
> welldata$genes_detected_zscore = ave(welldata$genes_detected, FUN = scale)
> welldata$genes_detected_pval = 2 * pnorm(-abs(welldata$genes_detected_zscore))
> welldata$genes_detected_padj = p.adjust(welldata$genes_detected_pval, method = "BH")

We see around 10k genes detected in each cell on each plate, with some cells with a low number of genes detected on each plate.

> ggplot(welldata, aes(batch, genes_detected)) + geom_boxplot() + ylab("genes with counts > 0") + 
+     xlab("") + theme_bw()

> welldata$counts = colSums(plates)
> ggplot(welldata, aes(batch, counts)) + geom_boxplot() + ylab("total counts") + 
+     xlab("") + theme_bw()

> ggplot(welldata, aes(counts, genes_detected, color = batch)) + geom_smooth(fill = NA) + 
+     ylab("genes with counts > 0") + xlab("total counts") + theme_bw()

In this histogram of the genes detected, we can see there are a set of cells with a low amount of genes detected, more in the M2 plate than the other plates but for the most part the plates look pretty similar.

> ggplot(welldata, aes(genes_detected)) + geom_histogram() + theme_bw() + xlab("genes with counts > 0") + 
+     facet_wrap(~batch)

Here we drop the cells with less than 7,500 genes detected.

> welldata = subset(welldata, genes_detected > 7500)
> plates = plates[, rownames(welldata)]
> library(biomaRt)
> mouse = useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "mmusculus_gene_ensembl", 
+     host = "jul2015.archive.ensembl.org")
> conversions = getBM(attributes = c("ensembl_gene_id", "mgi_symbol", "gene_biotype"), 
+     mart = mouse)

Samples do not have very many counts in noise genes, so that is not an issue. Noise genes are flagged by either being too small to be picked up reliably in a standard RNA-seq analysis or are highly variable and prone to introducing distortion such as rRNA.

> biotypes = unique(conversions$gene_biotype)
> noise_rna_biotypes = c("Mt_tRNA", "Mt_rRNA", "snoRNA", "snRNA", "misc_RNA", 
+     "scaRNA", "rRNA", "sRNA")
> noise_rna_genes = subset(conversions, gene_biotype %in% noise_rna_biotypes)$mgi_symbol
> noise_rna = rownames(plates)[rownames(plates) %in% noise_rna_genes]
> welldata$noise_counts = colSums(plates[noise_rna, ])
> ggplot(welldata, aes(batch, noise_counts)) + geom_boxplot() + ylab("counts in noise genes") + 
+     xlab("") + theme_bw()

We’ll drop the noise genes from consideration even though there aren’t many counts in them.

> plates = plates[!rownames(plates) %in% noise_rna_genes, ]

We’ll also drop all genes with that don’t have at least 100 counts total and are not seen in at least 4 samples. This cuts down the number of genes we are considering to ~13,500.

> plates = plates[rowSums(plates > 0) > 4 & rowSums(plates) > 100, ]

This is what we’re left with in terms of samples:

> knitr::kable(welldata %>% group_by(batch) %>% summarize(total = n()))
batch total
M1 302
M2 295
M3 369
M4 359
M5 358

and we’re left with 13488 genes to consider.

> library(Seurat)
> seurat.data = new("seurat", raw.data = plates)
> seurat.data = Setup(seurat.data, project = "rubin", min.cells = 3, min.genes = 1000, 
+     meta.data = welldata, total.expr = 10000)
[1] "Performing log-normalization"
[1] "Scaling data matrix"

  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=====                                                            |   7%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |============================================================     |  93%
  |                                                                       
  |=================================================================| 100%

Here we look at what are the most variable genes across the samples. We can see a lot of these are subunits of ribosomal proteins, these will be used for the PCA.

> seurat.data = MeanVarPlot(seurat.data, y.cutoff = 0.5, x.low.cutoff = 0.0125, 
+     x.high.cutoff = 3, fxn.x = expMean, fxn.y = logVarDivMean)
[1] "Calculating gene dispersion"

  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=====                                                            |   7%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |============================================================     |  93%
  |                                                                       
  |=================================================================| 100%

We’ll focus on component 1 since that seems to be what separates out the batch 3 plate from the other plates.

> seurat.data = PCA(seurat.data, do.print = FALSE)
> PCAPlot(seurat.data, 1, 2, pt.size = 2)

> rot = seurat.data@pca.rot %>% tibble::rownames_to_column(var = "sample") %>% 
+     left_join(welldata, by = "sample")
> ggplot(rot, aes(PC1, PC2, shape = classes, color = batch)) + geom_point(size = 2) + 
+     theme_bw()

There are quite a few ribosomal proteins that are flagged. These should for the most part eithr be stably expressed or are not particularly interesting hits.

Since TCL3 is so different, and it seems different in non-interesting ways, let’s just drop it for now to make our lives easier. We can see that after dropping TCL3 we see a separation of the AdMyoD samples from the other samples.

> welldata = subset(welldata, batch != "M3")
> plates = plates[, rownames(welldata)]
> seurat.data = new("seurat", raw.data = plates)
> seurat.data = Setup(seurat.data, project = "rubin", min.cells = 3, min.genes = 200, 
+     total.expr = 10000, meta.data = welldata)
[1] "Performing log-normalization"
[1] "Scaling data matrix"

  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=====                                                            |   7%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |============================================================     |  93%
  |                                                                       
  |=================================================================| 100%
> seurat.data = MeanVarPlot(seurat.data, y.cutoff = 0.5, x.low.cutoff = 0.0125, 
+     x.high.cutoff = 3, do.contour = F, fxn.x = expMean, fxn.y = logVarDivMean)
[1] "Calculating gene dispersion"

  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=====                                                            |   7%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |============================================================     |  93%
  |                                                                       
  |=================================================================| 100%

> seurat.data = PCA(seurat.data, do.print = FALSE)
> PCAPlot(seurat.data, 1, 2, pt.size = 2)

> seurat.data = ProjectPCA(seurat.data, do.print = FALSE)

  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=====                                                            |   7%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |============================================================     |  93%
  |                                                                       
  |=================================================================| 100%
> VizPCA(seurat.data, pcs.use = 1:3, num.genes = 10, use.full = TRUE, nCol = 3)

> rot = seurat.data@pca.rot %>% tibble::rownames_to_column(var = "sample") %>% 
+     left_join(welldata, by = "sample")
> ggplot(rot, aes(PC1, PC2, color = batch, label = treatment)) + geom_text(size = 2) + 
+     theme_bw()

We can see a batch effect in PC3 and PC4, which shows that we should make sure that we are controlling for batch effects when we do thes experiments.

> ggplot(rot, aes(PC3, PC4, color = batch, label = treatment)) + geom_text(size = 2) + 
+     theme_bw()

Going back to PC1 and PC2, we see there are some chemicals that seem to move the cells towards the AdMyoD treated cells. Zooming in on that area shows some more candidates:

> ggplot(rot, aes(PC1, PC2, color = batch, label = treatment)) + coord_cartesian(xlim = c(-30, 
+     0), ylim = c(-25, 0)) + geom_text(size = 2) + theme_bw()

Distance to AdMyoD samples

Here we take a different tactic and find the genes that are most different between the AdMyoD samples and the DMSO treated samples. Then we will use those genes to measure how far each of the treated samples is from the AdMyoD samples in terms of expression of those genes.

> md_samples = rownames(subset(welldata, treatment %in% c("AdMyoD", "DMSO")))
> md_metadata = welldata[md_samples, ]
> md_counts = plates[, md_samples]

We fit a model that tests for differences between AdMyoD and DMSO treatment while controlling for batch effects.

> library(DESeq2)
> dds = DESeqDataSetFromMatrix(countData = md_counts, colData = md_metadata, design = ~batch + 
+     treatment)
> dds = DESeq(dds)
> plotDispEsts(dds)

> res = results(dds)
> sig = subset(res, padj < 0.05) %>% data.frame() %>% tibble::rownames_to_column(var = "gene") %>% 
+     dplyr::arrange(padj)

There are 1524 genes tagged as differentially expressed between the AdMyoD and the DMSO treated samples. We’ll use those genes to measure how far the chemical treated samples are from the AdMyoD treated samples. This gives different results than the PCA method we were using:

> comp1class = welldata[, c("sample", "treatment", "classes")]
> colnames(comp1class) = c("sample", "comp1treat", "comp1class")
> comp2class = welldata[, c("sample", "treatment", "classes")]
> colnames(comp2class) = c("sample", "comp2treat", "comp2class")
> dds = DESeqDataSetFromMatrix(countData = plates, colData = welldata, design = ~batch + 
+     treatment)
> dds = estimateSizeFactors(dds)
> ncounts = log(counts(dds, normalized = TRUE) + 1)
> dists = as.matrix(dist(t(ncounts[sig$gene, ])))
> dists[diag(dists)] = NA
> dists = dists %>% data.frame() %>% na.omit() %>% tibble::rownames_to_column() %>% 
+     tidyr::gather(sample, distance, -rowname)
> colnames(dists) = c("comp1", "comp2", "distance")
> dists = dists %>% left_join(comp1class, by = c(comp1 = "sample")) %>% left_join(comp2class, 
+     by = c(comp2 = "sample")) %>% group_by(comp1treat, comp2treat) %>% summarise(mtreatdist = mean(distance), 
+     mtreatsd = sd(distance))
> myod = subset(dists, comp1treat == "AdMyoD") %>% dplyr::arrange(mtreatdist) %>% 
+     mutate(rank = dense_rank(mtreatdist))
> ggplot(myod, aes(rank, mtreatdist, label = comp2treat)) + geom_point() + theme(axis.title.x = element_blank(), 
+     axis.text.x = element_blank(), axis.ticks.x = element_blank()) + ylab("Euclidean distance to AdMyoD")

> write_csv(myod, "normalized-count-distance-to-admyod.csv")

PCA distances

Another way is to just look at the We can also calculate the distances between points on PC1 and PC2 to get an estimate of how similar the samples are to each other.

> pcamat = as.matrix(rot[, c("PC1", "PC2")])
> rownames(pcamat) = rot$sample
> pcadist = as.matrix(dist(pcamat))
> pcadist[diag(pcadist)] = NA
> pcadist = pcadist %>% data.frame() %>% na.omit() %>% tibble::rownames_to_column() %>% 
+     tidyr::gather(sample, distance, -rowname)
> colnames(pcadist) = c("comp1", "comp2", "distance")
> pcadist = pcadist %>% left_join(comp1class, by = c(comp1 = "sample")) %>% left_join(comp2class, 
+     by = c(comp2 = "sample")) %>% group_by(comp1treat, comp2treat) %>% summarise(mtreatdist = mean(distance), 
+     mtreatsd = sd(distance))
> myod = subset(pcadist, comp1treat == "AdMyoD") %>% dplyr::arrange(mtreatdist) %>% 
+     mutate(rank = dense_rank(mtreatdist))
> ggplot(myod, aes(rank, mtreatdist, label = comp2treat)) + geom_point() + theme(axis.title.x = element_blank(), 
+     axis.text.x = element_blank(), axis.ticks.x = element_blank()) + ylab("Euclidean distance to AdMyoD")

> write_csv(myod, "pca-distance-to-admyod.csv")
> comp1class = welldata[, c("sample", "treatment", "classes", "batch")]
> colnames(comp1class) = c("sample", "comp1treat", "comp1class", "batch")
> comp2class = welldata[, c("sample", "treatment", "classes", "batch")]
> colnames(comp2class) = c("sample", "comp2treat", "comp2class", "batch")
> pcamat = as.matrix(rot[, c("PC1", "PC2")])
> rownames(pcamat) = rot$sample
> pcadist = as.matrix(dist(pcamat))
> pcadist[diag(pcadist)] = NA
> pcadist = pcadist %>% data.frame() %>% na.omit() %>% tibble::rownames_to_column() %>% 
+     tidyr::gather(sample, distance, -rowname)
> colnames(pcadist) = c("comp1", "comp2", "distance")
> pcadist = pcadist %>% left_join(comb1class, by = c(comp1 = "sample")) %>% left_join(comb2class, 
+     by = c(comp2 = "sample")) %>% group_by(comb1treat, comb2treat, batch.y) %>% 
+     summarise(mtreatdist = mean(distance), mtreatsd = sd(distance))
> scall = subset(pcadist, comb1treat == "AdMyoD") %>% dplyr::arrange(mtreatdist)
> write_csv(scall, "pca-distance-to-admyod-by-batch.csv")

Positive control genes

Here we can see along PC1 a separation of the SC and Myo samples, the AdMyoD separation is along component 2. We have no way of determining though if this separation is due to batch differences or due to actual biological signal of SC and Myo.

> positive = positive[rownames(plates), ]
> combwelldata = rbind(positive_welldata, welldata[, colnames(positive_welldata)])
> combwelldata$classes = ifelse(combwelldata$treatment == "DMSO", "DMSO", ifelse(welldata$treatment == 
+     "SC", "SC", "other"))
> combwelldata$sample = rownames(combwelldata)
> combwell = cbind(positive, plates)
> library(Seurat)
> combined.data = new("seurat", raw.data = combwell)
> combined.data = Setup(combined.data, project = "rubin", min.cells = 3, min.genes = 1000, 
+     meta.data = combwelldata, total.expr = 10000)
[1] "Performing log-normalization"
[1] "Scaling data matrix"

  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=====                                                            |   7%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |============================================================     |  93%
  |                                                                       
  |=================================================================| 100%
> combined.data = MeanVarPlot(combined.data, y.cutoff = 0.5, x.low.cutoff = 0.0125, 
+     x.high.cutoff = 3, do.contour = F, fxn.x = expMean, fxn.y = logVarDivMean)
[1] "Calculating gene dispersion"

  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=====                                                            |   7%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |============================================================     |  93%
  |                                                                       
  |=================================================================| 100%

> combined.data = PCA(combined.data, do.print = FALSE)
> PCAPlot(combined.data, 1, 2, pt.size = 2)

> combined.data = ProjectPCA(combined.data, do.print = FALSE)

  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=====                                                            |   7%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |============================================================     |  93%
  |                                                                       
  |=================================================================| 100%
> rot = combined.data@pca.rot %>% tibble::rownames_to_column(var = "sample") %>% 
+     left_join(combwelldata, by = "sample")
> ggplot(rot, aes(PC1, PC2, color = batch, label = treatment)) + geom_text(size = 2) + 
+     theme_bw()

If it is not possible to include SC and Myo samples on the single-cell plate, we could also correct for batch if we had a set of DMSO treated samples on the SC and Myo plates, but we need some overlapping samples so we can correct for the batch effect.

> comb1class = combwelldata[, c("sample", "treatment", "classes", "batch")]
> colnames(comb1class) = c("sample", "comb1treat", "comb1class", "batch")
> comb2class = combwelldata[, c("sample", "treatment", "classes", "batch")]
> colnames(comb2class) = c("sample", "comb2treat", "comb2class", "batch")
> pcamat = as.matrix(rot[, c("PC1", "PC2")])
> rownames(pcamat) = rot$sample
> pcadist = as.matrix(dist(pcamat))
> pcadist[diag(pcadist)] = NA
> pcadist = pcadist %>% data.frame() %>% na.omit() %>% tibble::rownames_to_column() %>% 
+     tidyr::gather(sample, distance, -rowname)
> colnames(pcadist) = c("comp1", "comp2", "distance")
> pcadist = pcadist %>% left_join(comb1class, by = c(comp1 = "sample")) %>% left_join(comb2class, 
+     by = c(comp2 = "sample")) %>% group_by(comb1treat, comb2treat) %>% summarise(mtreatdist = mean(distance), 
+     mtreatsd = sd(distance))
> sc = subset(pcadist, comb1treat == "SC") %>% dplyr::arrange(mtreatdist) %>% 
+     mutate(rank = dense_rank(mtreatdist))
> ggplot(sc, aes(rank, mtreatdist, label = comb2treat)) + geom_point() + theme(axis.title.x = element_blank(), 
+     axis.text.x = element_blank(), axis.ticks.x = element_blank()) + ylab("Euclidean distance to SC")

> write_csv(sc, "pca-distance-to-SC.csv")
> comb1class = combwelldata[, c("sample", "treatment", "classes", "batch")]
> colnames(comb1class) = c("sample", "comb1treat", "comb1class", "batch")
> comb2class = combwelldata[, c("sample", "treatment", "classes", "batch")]
> colnames(comb2class) = c("sample", "comb2treat", "comb2class", "batch")
> pcamat = as.matrix(rot[, c("PC1", "PC2")])
> rownames(pcamat) = rot$sample
> pcadist = as.matrix(dist(pcamat))
> pcadist[diag(pcadist)] = NA
> pcadist = pcadist %>% data.frame() %>% na.omit() %>% tibble::rownames_to_column() %>% 
+     tidyr::gather(sample, distance, -rowname)
> colnames(pcadist) = c("comp1", "comp2", "distance")
> pcadist = pcadist %>% left_join(comb1class, by = c(comp1 = "sample")) %>% left_join(comb2class, 
+     by = c(comp2 = "sample")) %>% group_by(comb1treat, comb2treat, batch.y) %>% 
+     summarise(mtreatdist = mean(distance), mtreatsd = sd(distance))
> scall = subset(pcadist, comb1treat == "SC") %>% dplyr::arrange(mtreatdist)
> write_csv(scall, "pca-distance-to-SC-by-batch.csv")